*-------------------------------------------------------------------------------
*						Data Pre-Process
*-------------------------------------------------------------------------------

** Project folders (each global is referenced below with a leading dollar sign)
global Raw_data   "G:\project-finished\Descriptive\Data"
global App_data   "G:\project-finished\Descriptive\Appendix Data"
global Class_data "G:\project-finished\Descriptive\Classification"
global Work_lab   "G:\project-finished\Descriptive\Lab"
global Out_lab    "G:\project-finished\Descriptive\Out"

** Work inside the lab folder so that relative file names below resolve there
cd "$Work_lab"

** Close any log that is still open (no error if none), then start a fresh one
capture log close
log using "$Out_lab\Pre-industry", replace

** Do not pause output waiting for --more--
set more off

*-------------------------------------------------------------------------------
*						Step 1: Build a consistent Chinese CIC classification
*-------------------------------------------------------------------------------

*-------------------------------------------------------------------------------
*2000--GBT4754-1994
*2005--GBT4754-2002
*2010--GBT4754-2002
*2015--GBT4754-2011
*-------------------------------------------------------------------------------


* Build one code/name lookup per year at the 3-digit (中类) level, to see how many
* industries can be matched directly by name (very few can be matched that way).
* Each lookup ends up with the same two variables: industry and industry_name.

* Lookup saved as the 2000 file (loaded from cic02_3dnames_new.dta)
use "$Class_data\行业标准转换\cic02_3dnames_new.dta", clear
drop note
rename (cic_02_3d ind_3dname) (industry industry_name)
save "2000industry_分类.dta", replace

* Lookup for 2010
import excel "$Class_data\行业标准转换\2010industry_分类.xlsx", sheet("Sheet2") firstrow allstring clear
rename (中类 名称) (industry industry_name)
* Codes whose numeric value is below 100 get one leading zero
replace industry = "0" + industry if real(industry) < 100
save "2010industry_分类.dta", replace

* Lookup for 2015
import excel "$Class_data\行业标准转换\2015industry_分类.xlsx", sheet("中类") firstrow allstring clear
rename (中类 类别名称) (industry industry_name)
* Codes whose numeric value is below 100 get one leading zero
replace industry = "0" + industry if real(industry) < 100
save "2015industry_分类.dta", replace

*------------------2002-1994----------------------------------------------

* Build a 3-digit crosswalk from 1994 codes (pre_94_3dig) to 2002 codes (new),
* starting from the 4-digit 2002/1994 concordance table.
* Matching on industry names directly was tried and matches very few rows, so the
* mapping is derived from the code pairs instead.
* Output: cic94_to_cic02.dta with variables pre_94_3dig and new, one row per
* pre_94_3dig.
* Requires the user-written command carryforward.

import excel "$Class_data\行业标准转换\国民经济行业分类和代码表\2002与1994版《国民经济行业分类》国家标准对照表.xlsx", sheet("Sheet1") firstrow allstring clear

* Remove rows in which all four columns are blank
drop if post_02ind == "" & post_02ind_name == "" & pre_94ind == "" & pre_94ind_name == ""

* Fill blank 2002 code/name cells from the previous row
* (presumably merged cells in the spreadsheet -- TODO confirm)
carryforward post_02ind, gen(post_02ind2) extmiss
carryforward post_02ind_name, gen(post_02ind_name2) extmiss
replace post_02ind = post_02ind2 if post_02ind != post_02ind2
replace post_02ind_name = post_02ind_name2 if post_02ind_name != post_02ind_name2
drop post_02ind2 post_02ind_name2

* Restore the leading zero that is missing from some codes.
* NOTE(review): the 2002 codes are padded by row position (first 98 rows), which
* depends on the exact row order of the spreadsheet -- re-check if the file changes.
replace post_02ind = "0" + post_02ind if _n <= 98
replace pre_94ind = "0" + pre_94ind if strlen(pre_94ind) == 3

* Keep only rows whose 2002 code has at least 4 characters, then cut both codes
* down to their first 3 characters
drop if strlen(post_02ind) < 4
gen post02_3dig = substr(post_02ind, 1, 3)
gen pre_94_3dig = substr(pre_94ind, 1, 3)
* NOTE(review): rows with a blank pre_94ind are kept and form their own group with
* pre_94_3dig == "" -- confirm whether such rows should be dropped instead.

* share = fraction of a 1994 3-digit code's rows that fall in each 2002 3-digit code,
* i.e. how strongly the 1994 industry corresponds to that 2002 industry
gen pair = _n
bysort pre_94_3dig: egen number = count(pair)
bysort post02_3dig pre_94_3dig: egen temp = count(pair)
gen share = temp/number
keep post02_3dig pre_94_3dig share
duplicates drop

* Keep, for every 1994 code, only the 2002 code(s) with the largest share
bysort pre_94_3dig: egen tag = max(share)
gen pre_match = post02_3dig if share == tag
drop if pre_match == ""

* If several 2002 codes tie on share (e.g. a half/half split), prefer the one whose
* numeric code is closest to the 1994 code. A group with a single candidate always
* passes this filter, so no separate single-candidate case is needed.
gen near = abs(real(post02_3dig) - real(pre_94_3dig))
bysort pre_94_3dig: egen choose = min(near)
keep if near == choose
gen new = pre_match

* Candidates can still tie on both share and distance (one code above and one below
* at the same distance). Keep the smallest 2002 code so that the crosswalk has exactly
* one row per 1994 code and the result does not depend on sort order.
bysort pre_94_3dig (new): keep if _n == 1

keep pre_94_3dig new
save "cic94_to_cic02.dta", replace

*------------------2011-2002----------------------------------------------
* Build a 3-digit crosswalk from 2011 codes (post11_3dig) to 2002 codes (new),
* starting from the 4-digit 2011/2002 concordance table.
* Output: cic11_to_cic02.dta with variables post11_3dig and new, one row per
* post11_3dig.
* Requires the user-written command carryforward.
import excel "$Class_data\行业标准转换\国民经济行业分类和代码表\2011与2002版《国民经济行业分类》国家标准对照表.xlsx", sheet("Sheet1") firstrow allstring clear

* Remove rows in which all four columns are blank
drop if post_11ind == "" & post_11ind_name == "" & pre_02ind == "" & pre_02ind_name == ""

* Fill blank 2011 code/name cells from the previous row
* (presumably merged cells in the spreadsheet -- TODO confirm)
carryforward post_11ind, gen(post_11ind2) extmiss
carryforward post_11ind_name, gen(post_11ind_name2) extmiss
replace post_11ind = post_11ind2 if post_11ind != post_11ind2
replace post_11ind_name = post_11ind_name2 if post_11ind_name != post_11ind_name2
drop post_11ind2 post_11ind_name2

* NOTE(review): unlike the 2002-1994 section, no leading zeros are restored here.
* If this spreadsheet also lost leading zeros, those rows are removed by the length
* filter below -- confirm against the raw file.

* Keep only rows whose 2011 code has at least 4 characters, then cut both codes
* down to their first 3 characters
drop if strlen(post_11ind) < 4
gen post11_3dig = substr(post_11ind, 1, 3)
gen pre02_3dig = substr(pre_02ind, 1, 3)

* share = fraction of a 2011 3-digit code's rows that fall in each 2002 3-digit code,
* i.e. how strongly the 2011 industry corresponds to that 2002 industry
gen pair = _n
bysort post11_3dig: egen number = count(pair)
bysort post11_3dig pre02_3dig: egen temp = count(pair)
gen share = temp/number
keep post11_3dig pre02_3dig share
duplicates drop

* Keep, for every 2011 code, only the 2002 code(s) with the largest share.
* A blank 2002 code that holds the largest share leaves pre_match empty, so that
* candidate is dropped here.
bysort post11_3dig: egen tag = max(share)
gen pre_match = pre02_3dig if share == tag
drop if pre_match == ""

* If several 2002 codes tie on share (e.g. a half/half split), prefer the one whose
* numeric code is closest to the 2011 code. A group with a single candidate always
* passes this filter, so no separate single-candidate case is needed.
gen near = abs(real(post11_3dig) - real(pre02_3dig))
bysort post11_3dig: egen choose = min(near)
keep if near == choose
gen new = pre_match

* Candidates can still tie on both share and distance (one code above and one below
* at the same distance). Keep the smallest 2002 code so that the crosswalk has exactly
* one row per 2011 code and the result does not depend on sort order.
bysort post11_3dig (new): keep if _n == 1

keep post11_3dig new
save "cic11_to_cic02.dta", replace

* Attach industry names. The 2002 standard is the baseline, so the names come from
* the 2010 lookup file; its variables are renamed to new / new_title so that they
* can be merged on the crosswalk variable new.
use "2010industry_分类.dta", clear
rename industry new
rename industry_name new_title
save "industry_match_name.dta", replace

* Stack both crosswalks into one file covering all census years, then add the names.
* Rows from the 1994 crosswalk carry pre_94_3dig, rows from the 2011 crosswalk carry
* post11_3dig.
use "cic94_to_cic02.dta", clear
append using "cic11_to_cic02.dta"

* Unmatched rows from either side are kept; only the merge indicator is removed
merge m:1 new using "industry_match_name.dta"
drop _merge

save "$Class_data\行业标准转换\census2000-2015_行业统一版本.dta", replace

* Delete the intermediate files created in Step 1
foreach f in "industry_match_name.dta" "cic94_to_cic02.dta" "cic11_to_cic02.dta" "2010industry_分类.dta" "2000industry_分类.dta" "2015industry_分类.dta" {
    erase "`f'"
}

*-------------------------------------------------------------------------------
*			Step 2: Map the consistent Chinese CIC to the consistent US (ACS) industry classification
*-------------------------------------------------------------------------------
/* Clean the ACS industry correspondence: restore leading zeros in the code columns */
import excel "$Class_data\industry correspondance\acs industry correspondance.xlsx", sheet("Sheet1") firstrow allstring clear
compress
replace IND2000 = "0" + IND2000 if strlen(IND2000) == 2
foreach v of varlist IND2010 IND2015 {
    replace `v' = "0" + `v' if strlen(`v') == 3
}
* This file is not read again in this section and is erased at the end of it
save acs_industry_correspondance.dta, replace

/* Chinese industries (new, new_title) against the adjusted unified classification */
import excel "$Class_data\industry correspondance\cic acsindustry correspondance.xlsx", sheet("CIC-consistent") firstrow allstring clear
keep new new_title 最终调整 部门分类
duplicates drop
save cic_acsind_chn.dta, replace

/* US industries against the adjusted unified classification, one file per year.
   Each pass re-imports the same sheet, keeps that year's IND column plus the two
   classification columns, pads short codes with one leading zero (2-character
   codes for 2000, 3-character codes for 2010 and 2015), drops rows with a blank
   code and removes duplicate rows. */
foreach yr in 2000 2010 2015 {
    local shortlen = cond(`yr' == 2000, 2, 3)
    import excel "$Class_data\industry correspondance\cic acsindustry correspondance.xlsx", sheet("CIC-consistent") firstrow allstring clear
    keep IND`yr' 最终调整 部门分类
    replace IND`yr' = "0" + IND`yr' if strlen(IND`yr') == `shortlen'
    drop if IND`yr' == ""
    duplicates drop
    save cic_acsind_usa`yr'.dta, replace
}

erase acs_industry_correspondance.dta

log close